{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5"
   },
   "outputs": [],
   "source": [
    "import numpy as np # linear algebra\n",
    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "import string\n",
    "import xgboost as xgb\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
    "from sklearn.decomposition import TruncatedSVD\n",
    "from sklearn import ensemble, metrics, model_selection, naive_bayes\n",
    "color = sns.color_palette()\n",
    "\n",
    "eng_stopwords = set(stopwords.words(\"english\"))\n",
    "pd.options.mode.chained_assignment = None\n",
    "\n",
    "from sklearn.model_selection import KFold,cross_val_score\n",
    "%matplotlib inline\n",
    "\n",
    "from sklearn.metrics import mean_squared_error\n",
    "\n",
    "from sklearn import model_selection, preprocessing, metrics\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
    "import lightgbm as lgb\n",
    "\n",
    "from sklearn.ensemble import  RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor\n",
    "from lightgbm import LGBMRegressor\n",
    "from xgboost import XGBRegressor\n",
    "from sklearn.linear_model import LinearRegression, Ridge,RidgeCV,LarsCV,LassoCV,ElasticNetCV,LassoLarsCV\n",
    "!ls ../input/mh-the-price-of-books/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
    "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a"
   },
   "outputs": [],
   "source": [
    "train = pd.read_excel('../input/mh-the-price-of-books/Data_Train.xlsx')\n",
    "test = pd.read_excel('../input/mh-the-price-of-books/Data_Test.xlsx')\n",
    "ss = pd.read_excel('../input/mh-the-price-of-books/Sample_Submission.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train.shape,test.shape,ss.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.distplot(np.log(train['Price']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in test.columns:\n",
    "    print(i,':',train[i].nunique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train['BookCategory'].value_counts().index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "l1 = ['Action & Adventure', 'Crime, Thriller & Mystery',\n",
    "       'Biographies, Diaries & True Accounts',\n",
    "       'Language, Linguistics & Writing', 'Comics & Mangas', 'Romance',\n",
    "       'Humour', 'Arts, Film & Photography',\n",
    "       'Computing, Internet & Digital Media', 'Sports', 'Politics']\n",
    "l2 = list(range(len(l1)))\n",
    "d = dict(zip(l1,l2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train['BookCategory'] = train['BookCategory'].map(d)\n",
    "test['BookCategory'] = test['BookCategory'].map(d)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.concat([train,pd.get_dummies(train['BookCategory'],prefix ='BookCategory',drop_first=True)],axis=1)\n",
    "test = pd.concat([test,pd.get_dummies(test['BookCategory'],prefix ='BookCategory',drop_first=True)],axis=1)\n",
    "                  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train.drop('BookCategory',axis=1,inplace=True)\n",
    "test.drop('BookCategory',axis=1,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#train['Reviews'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#train['Ratings'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train['Title'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train[train.Title == 'A Game of Thrones (A Song of Ice and Fire)']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train['reviews'] = train['Reviews'].apply(lambda x: x.split()[0])\n",
    "test['reviews'] = test['Reviews'].apply(lambda x: x.split()[0])\n",
    "\n",
    "train['ratings'] = train['Ratings'].apply(lambda x: x.split()[0])\n",
    "test['ratings'] = test['Ratings'].apply(lambda x: x.split()[0])\n",
    "\n",
    "train.drop(['Reviews','Ratings'],axis=1,inplace=True)\n",
    "test.drop(['Reviews','Ratings'],axis=1,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hand-crafted statistics of the Synopsis text: one (column name, function)\n",
    "# pair per feature, applied in the same order to the train and test frames.\n",
    "synopsis_features = [\n",
    "    # number of whitespace-separated words\n",
    "    ('num_words', lambda text: len(str(text).split())),\n",
    "    # number of distinct words\n",
    "    ('num_unique_words', lambda text: len(set(str(text).split()))),\n",
    "    # number of characters\n",
    "    ('num_chars', lambda text: len(str(text))),\n",
    "    # number of (lower-cased) words found in eng_stopwords\n",
    "    ('num_stopwords', lambda text: sum(1 for w in str(text).lower().split() if w in eng_stopwords)),\n",
    "    # number of punctuation characters\n",
    "    ('num_punctuations', lambda text: sum(1 for c in str(text) if c in string.punctuation)),\n",
    "    # number of upper-case words\n",
    "    ('num_words_upper', lambda text: sum(1 for w in str(text).split() if w.isupper())),\n",
    "    # number of title-case words\n",
    "    ('num_words_title', lambda text: sum(1 for w in str(text).split() if w.istitle())),\n",
    "    # average word length\n",
    "    ('mean_word_len', lambda text: np.mean([len(w) for w in str(text).split()])),\n",
    "]\n",
    "\n",
    "for frame in (train, test):\n",
    "    for feature_name, compute in synopsis_features:\n",
    "        frame[feature_name] = frame['Synopsis'].apply(compute)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train['Edition_cover'] = train['Edition'].apply(lambda x : x.split(',')[0])\n",
    "test['Edition_cover'] = test['Edition'].apply(lambda x : x.split(',')[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def rare_imputation(X_train, X_test, variable, threshold=0.007):\n",
    "    \"\"\"Fold rare categories of ``variable`` into the most frequent category.\n",
    "\n",
    "    Category frequencies are computed on ``X_train`` only; the resulting\n",
    "    replacement is applied to both frames, which are modified in place.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    X_train, X_test : pandas.DataFrame\n",
    "        Frames that both contain the column ``variable``.\n",
    "    variable : str\n",
    "        Name of the categorical column to clean up.\n",
    "    threshold : float, default 0.007\n",
    "        A category is rare when its share of ``X_train`` rows is below this.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    None\n",
    "    \"\"\"\n",
    "    counts = X_train.groupby(variable)[variable].count()\n",
    "    if counts.empty:\n",
    "        # nothing to learn from an empty training column\n",
    "        return\n",
    "\n",
    "    # most frequent category: last entry after an ascending sort\n",
    "    frequent_cat = counts.sort_values().tail(1).index.values[0]\n",
    "\n",
    "    # share of training rows per category; the builtin float is used\n",
    "    # because the np.float alias was removed from NumPy in 1.24\n",
    "    shares = counts / float(len(X_train))\n",
    "    rare_cat = list(shares.loc[shares < threshold].index.values)\n",
    "\n",
    "    # overwrite rare labels with the most frequent category in both frames\n",
    "    X_train[variable] = np.where(X_train[variable].isin(rare_cat), frequent_cat, X_train[variable])\n",
    "    X_test[variable] = np.where(X_test[variable].isin(rare_cat), frequent_cat, X_test[variable])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rare_imputation(train, test, 'Edition_cover')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train['Edition_cover'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test['Edition_cover'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "d = {'Paperback':'Paperback', 'Hardcover':'Hardcover', 'Mass Market Paperback':'Mass Market Paperback', '(Chinese)':'Mass Market Paperback'}\n",
    "test['Edition_cover'] = test['Edition_cover'].map(d)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.concat([train,pd.get_dummies(train['Edition_cover'],prefix ='Edition_cover',drop_first=True)],axis=1)\n",
    "test = pd.concat([test,pd.get_dummies(test['Edition_cover'],prefix ='Edition_cover',drop_first=True)],axis=1)\n",
    "\n",
    "train.drop('Edition_cover',axis=1,inplace=True)\n",
    "test.drop('Edition_cover',axis=1,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train['Edition_date'] = train['Edition'].apply(lambda x: x.split()[-1])\n",
    "test['Edition_date'] = test['Edition'].apply(lambda x: x.split()[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train['Edition_date'].value_counts().index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "l1 = ['2018', '2017', '2016', '2015', '2014', '2013', '2019', '2012', '2011',\n",
    "       '2010', '2009', '2008', '2005', '2006', '2007', '2003', '2004', '2002',\n",
    "       '2000', '2001', '1999', '1997', '1994', '1998', '1992', '1995', '1996',\n",
    "       '1993', '1989', '1991', '1986', '1988', '1984', '1990', '1982', '1987',\n",
    "       '1985', '1983', 'Import', '1976', 'set', '1980', '1971', '1975', '1974',\n",
    "       '1977', '1964', 'Print', '1979', '1960', '1978', '1961', '1969', 'NTSC',\n",
    "       '1900', '1925', 'Audiobook', 'Facsimile', '1970', 'Edition',\n",
    "       'Unabridged', '1973', '1981', '1905']\n",
    "l2 = [2018, 2017, 2016, 2015, 2014, 2013, 2019, 2012, 2011,\n",
    "       2010, 2009, 2008, 2005, 2006, 2007, 2003, 2004, 2002,\n",
    "       2000, 2001, 1999, 1997, 1994, 1998, 1992, 1995, 1996,\n",
    "       1993, 1989, 1991, 1986, 1988, 1984, 1990, 1982, 1987,\n",
    "       1985, 1983, 9999, 1976, 9999, 1980, 1971, 1975, 1974,\n",
    "       1977, 1964, 9999, 1979, 1960, 1978, 1961, 1969, 9999,\n",
    "       1900, 1925, 9999, 9999, 1970, 9999,\n",
    "       9999, 1973, 1981, 1905]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "d = dict(zip(l1,l2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _edition_year(token):\n",
    "    \"\"\"Translate the last token of ``Edition`` into a year (9999 = no year).\n",
    "\n",
    "    Tokens present in the lookup ``d`` keep their hand-written mapping.\n",
    "    Any other token is parsed as a 4-digit year when possible and mapped\n",
    "    to the 9999 sentinel otherwise, instead of silently becoming NaN\n",
    "    (e.g. a year that only occurs in the test set).\n",
    "    \"\"\"\n",
    "    if token in d:\n",
    "        return d[token]\n",
    "    text = str(token)\n",
    "    return int(text) if len(text) == 4 and text.isdigit() else 9999\n",
    "\n",
    "train['Edition_date'] = train['Edition_date'].map(_edition_year)\n",
    "test['Edition_date'] = test['Edition_date'].map(_edition_year)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#train['Edition_date'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train['Genre'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "# Unigram TF-IDF of the Genre text, fitted on train and applied to test.\n",
    "tf = TfidfVectorizer(ngram_range=(1, 1), lowercase=False)\n",
    "train_route = tf.fit_transform(train['Genre'])\n",
    "test_route = tf.transform(test['Genre'])\n",
    "\n",
    "# get_feature_names() was removed in scikit-learn 1.2; use its replacement\n",
    "# get_feature_names_out() when the installed version provides it.\n",
    "genre_terms = (getattr(tf, 'get_feature_names_out', None) or tf.get_feature_names)()\n",
    "\n",
    "train_route = pd.DataFrame(data=train_route.toarray(), columns=genre_terms)\n",
    "test_route = pd.DataFrame(data=test_route.toarray(), columns=genre_terms)\n",
    "\n",
    "train_route.shape,test_route.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "# Unigram TF-IDF of the Title text, fitted on train and applied to test.\n",
    "tf = TfidfVectorizer(ngram_range=(1, 1), lowercase=False)\n",
    "train_route1 = tf.fit_transform(train['Title'])\n",
    "test_route1 = tf.transform(test['Title'])\n",
    "\n",
    "# get_feature_names() was removed in scikit-learn 1.2; use its replacement\n",
    "# get_feature_names_out() when the installed version provides it.\n",
    "title_terms = (getattr(tf, 'get_feature_names_out', None) or tf.get_feature_names)()\n",
    "\n",
    "train_route1 = pd.DataFrame(data=train_route1.toarray(), columns=title_terms)\n",
    "test_route1 = pd.DataFrame(data=test_route1.toarray(), columns=title_terms)\n",
    "\n",
    "train_route1.shape,test_route1.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer\n",
    "\n",
    "def _ngram_frames(vectorizer, train_text, test_text):\n",
    "    \"\"\"Fit ``vectorizer`` on ``train_text``; return dense (train, test) DataFrames.\n",
    "\n",
    "    Both frames use the vectoriser's feature names as column labels.\n",
    "    \"\"\"\n",
    "    train_matrix = vectorizer.fit_transform(train_text)\n",
    "    test_matrix = vectorizer.transform(test_text)\n",
    "    # get_feature_names() was removed in scikit-learn 1.2; use its\n",
    "    # replacement get_feature_names_out() when available\n",
    "    terms = (getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names)()\n",
    "    train_frame = pd.DataFrame(data=train_matrix.toarray(), columns=terms)\n",
    "    test_frame = pd.DataFrame(data=test_matrix.toarray(), columns=terms)\n",
    "    return train_frame, test_frame\n",
    "\n",
    "# 1- to 5-gram TF-IDF weights of the Synopsis text\n",
    "tf = TfidfVectorizer(ngram_range=(1, 5), lowercase=False)\n",
    "train_route2, test_route2 = _ngram_frames(tf, train['Synopsis'], test['Synopsis'])\n",
    "\n",
    "# 1- to 5-gram raw counts of the Synopsis text\n",
    "tf = CountVectorizer(ngram_range=(1, 5), lowercase=False)\n",
    "train_route3, test_route3 = _ngram_frames(tf, train['Synopsis'], test['Synopsis'])\n",
    "\n",
    "train_route2.shape,test_route2.shape,train_route3.shape,test_route3.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# custom function to run light gbm model\n",
    "def run_lgb(train_X, train_y, val_X, val_y, test_X):\n",
    "    \"\"\"Train a LightGBM regressor with early stopping on a validation split.\n",
    "\n",
    "    Trains on ``(train_X, train_y)`` for at most 1000 rounds and stops once\n",
    "    the RMSE on ``(val_X, val_y)`` has not improved for 100 rounds.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple\n",
    "        ``(pred_test_y, model, pred_val_y)``: predictions for ``test_X``,\n",
    "        the trained booster, and predictions for ``val_X``; both predictions\n",
    "        are taken at the booster's best iteration.\n",
    "    \"\"\"\n",
    "    params = {\n",
    "        'learning_rate': 0.04,\n",
    "        'max_depth': 15,\n",
    "        'min_child_samples': 8,\n",
    "        'num_leaves': 55,\n",
    "        'objective': 'regression',\n",
    "        'metric': 'rmse',\n",
    "    }\n",
    "\n",
    "    lgtrain = lgb.Dataset(train_X, label=train_y)\n",
    "    lgval = lgb.Dataset(val_X, label=val_y)\n",
    "    # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments\n",
    "    # of older LightGBM releases; newer ones expect callbacks instead --\n",
    "    # confirm against the installed version.\n",
    "    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=100)\n",
    "\n",
    "    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)\n",
    "    pred_val_y = model.predict(val_X, num_iteration=model.best_iteration)\n",
    "    return pred_test_y, model, pred_val_y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = pd.concat([train,train_route,train_route1,train_route2,train_route3],axis=1)\n",
    "test_df = pd.concat([test,test_route,test_route1,test_route2,test_route3],axis=1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df.shape,test_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_cols = list(train_df.select_dtypes(include=['object']).columns)\n",
    "cat_cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df[cat_cols].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "drop_col = ['Title','Edition','Synopsis','Genre']\n",
    "train_df.drop(drop_col,axis=1,inplace=True)\n",
    "test_df.drop(drop_col,axis=1,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df1 = train_df[['Author','ratings','reviews']].copy()\n",
    "test_df1 = test_df[['Author','ratings','reviews']].copy()\n",
    "train_df.drop('Author',axis=1,inplace=True)\n",
    "test_df.drop('Author',axis=1,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df1.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Position-based pick of columns 1, 3 and 4. This needs at least five columns,\n",
    "# so it only works when the label-based selection ['Author','ratings','reviews']\n",
    "# that built train_df1 / test_df1 returned duplicate-named columns.\n",
    "# NOTE(review): the duplicates presumably come from vectoriser token columns\n",
    "# sharing these names; the positions depend on the data -- confirm against\n",
    "# train_df1.columns before relying on this cell.\n",
    "train_df1 = train_df1.iloc[:,[1,3,4]]\n",
    "test_df1 = test_df1.iloc[:,[1,3,4]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df1['reviews'] = train_df1['reviews'].apply(lambda x : float(x))\n",
    "test_df1['reviews'] = test_df1['reviews'].apply(lambda x : float(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df1['ratings'] = train_df1['ratings'].apply(lambda x : x.replace(',',''))\n",
    "test_df1['ratings'] = test_df1['ratings'].apply(lambda x : x.replace(',',''))\n",
    "\n",
    "train_df1['ratings'] = train_df1['ratings'].apply(lambda x : int(x))\n",
    "test_df1['ratings'] = test_df1['ratings'].apply(lambda x : int(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df1.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df.drop('ratings\treviews'.split(),axis=1,inplace=True)\n",
    "test_df.drop('ratings\treviews'.split(),axis=1,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = pd.concat([train_df,train_df1],axis=1)\n",
    "test_df = pd.concat([test_df,test_df1],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df.shape,test_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df['Price']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train_df may hold several columns labelled 'Price' (presumably the real\n",
    "# target plus a vectoriser token of the same name -- verify); the first one\n",
    "# is taken as the target. A single 'Price' column is handled as well.\n",
    "price = train_df['Price']\n",
    "y = price.iloc[:, 0] if isinstance(price, pd.DataFrame) else price\n",
    "\n",
    "train_df.drop('Price',axis=1,inplace=True)\n",
    "# test_df only has a 'Price' column when such a duplicate exists\n",
    "test_df.drop('Price',axis=1,inplace=True,errors='ignore')\n",
    "\n",
    "# the model is trained on log1p(Price)\n",
    "train_y = np.log1p(y)\n",
    "y = np.log1p(y)\n",
    "X = train_df.copy()\n",
    "X_test = test_df.copy()\n",
    "train_df.shape,test_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "set(train_df.columns) - set(test_df.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dev_X, val_X, dev_y, val_y = model_selection.train_test_split(X, y, test_size=0.25, random_state=42)\n",
    "\n",
    "random_seed = 2019\n",
    "k = 5\n",
    "fold = list(KFold(k, shuffle = True, random_state = random_seed).split(train))\n",
    "np.random.seed(random_seed)\n",
    "from datetime import datetime\n",
    "\n",
    "print(dev_X.shape,val_X.shape,test_df.shape,val_y.shape,dev_y.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training the model #\n",
    "pred_test, model, pred_val = run_lgb(dev_X, dev_y, val_X, val_y, X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k =[] \n",
    "arr=model.feature_importance()\n",
    "for i in range(len(arr)):\n",
    "    if arr[i] !=0:\n",
    "        k.append(i)\n",
    "\n",
    "len(k)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_var = np.array(X.columns)\n",
    "k_var[k].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_var = list(k_var[k])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = X[k_var]\n",
    "X_test = X_test[k_var]\n",
    "X.shape,X_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = X.values\n",
    "X_test = X_test.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.model_selection import KFold, StratifiedKFold\n",
    "from sklearn import model_selection, preprocessing, metrics\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import os\n",
    "\n",
    "lgb_params1 ={'learning_rate': 0.04, 'max_depth': 15, 'min_child_samples': 8, 'num_leaves': 55,\"objective\" : \"regression\", \"metric\" : \"rmse\"}\n",
    "\n",
    "\n",
    "# Upper-case flag: a lower-case `run_lgb` here would rebind the run_lgb()\n",
    "# helper defined earlier in the notebook and break re-running its cells.\n",
    "RUN_LGB = True\n",
    "print('LGB : ', RUN_LGB)\n",
    "# modeling\n",
    "#--------------------------------------------------------------------------\n",
    "if RUN_LGB:\n",
    "    import lightgbm as lgb\n",
    "\n",
    "    def kfold_lgb_xgb():\n",
    "        \"\"\"Run 5-fold LightGBM on the notebook-level X / y and predict X_test.\n",
    "\n",
    "        Reads the globals ``X``, ``y``, ``X_test`` and ``lgb_params1``.\n",
    "        Writes the clipped out-of-fold predictions to\n",
    "        ``lgb_oof_<cv>.csv`` and prints the CV score.\n",
    "\n",
    "        Returns\n",
    "        -------\n",
    "        tuple\n",
    "            ``(cv_lgb, predictions_lgb)``: the out-of-fold RMSE as a string\n",
    "            truncated to 10 characters, and the fold-averaged test predictions.\n",
    "        \"\"\"\n",
    "        folds = KFold(n_splits=5, shuffle=True, random_state=7)\n",
    "        # plain array so that fold indices are always positional\n",
    "        target = np.asarray(y)\n",
    "\n",
    "        oof_lgb = np.zeros(len(X))\n",
    "        predictions_lgb = np.zeros(len(X_test))\n",
    "\n",
    "        for fold_, (trn_idx, val_idx) in enumerate(folds.split(X)):\n",
    "            trn_data = lgb.Dataset(X[trn_idx], label=target[trn_idx])\n",
    "            val_data = lgb.Dataset(X[val_idx], label=target[val_idx])\n",
    "\n",
    "            print('LGB ' + str(fold_) + '-' * 50)\n",
    "            num_round = 30000\n",
    "            # NOTE(review): verbose_eval / early_stopping_rounds are keyword\n",
    "            # arguments of older LightGBM releases -- confirm installed version.\n",
    "            clf = lgb.train(lgb_params1, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 100)\n",
    "            oof_lgb[val_idx] = clf.predict(X[val_idx], num_iteration=clf.best_iteration)\n",
    "\n",
    "            predictions_lgb += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits\n",
    "\n",
    "        # negative predictions are clipped to 0 before scoring\n",
    "        oof_clipped = np.where(oof_lgb < 0, 0.0, oof_lgb)\n",
    "        cv_lgb = str(mean_squared_error(target, oof_clipped) ** 0.5)[:10]\n",
    "\n",
    "        pd.DataFrame({'preds': oof_clipped}).to_csv('lgb_oof_' + cv_lgb + '.csv', index = False)\n",
    "\n",
    "        print('CV_LGB : ', cv_lgb)\n",
    "        return cv_lgb, predictions_lgb\n",
    "\n",
    "    cv_lgb, lgb_ans = kfold_lgb_xgb()\n",
    "    lgb_ans_clipped = np.where(lgb_ans < 0, 0.0, lgb_ans)\n",
    "    np.save('lgb_ans.npy', lgb_ans_clipped)\n",
    "\n",
    "    # separate name so the `test` feature frame is not overwritten\n",
    "    submission = ss.copy()\n",
    "    # predictions are on the log1p scale; expm1 converts them back to prices\n",
    "    submission['Price'] = np.round(np.expm1(lgb_ans_clipped))\n",
    "    submission.to_csv('lgb_' + cv_lgb + '.csv',index=False)\n",
    "    # the context manager saves and closes the workbook\n",
    "    # (ExcelWriter.save() was removed in pandas 2.0)\n",
    "    with pd.ExcelWriter('MH_v3_lgb.xlsx') as writer:\n",
    "        submission[['Price']].to_excel(writer, sheet_name='Sheet1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
